First, we need to read in the data and combine the training and test sets into one data frame.
# Read the train/test splits and stack them into a single data frame.
# add_row() adds individual rows built from named values and errors when
# handed a whole data frame; bind_rows() is the correct way to
# concatenate two data frames with the same columns.
train_data <- read_csv("reddit_stress_data/dreaddit-train.csv", show_col_types = FALSE)
test_data <- read_csv("reddit_stress_data/dreaddit-test.csv", show_col_types = FALSE)
reddit_stress_data <- bind_rows(train_data, test_data)
Now we need to find the word distributions. We’ll start by unnesting the tokens, computing this over the full dataset.
# Tokenize each post into one word per row, strip punctuation, replace
# pure-number tokens with a "%d%" placeholder, and remove English stop
# words. The explicit `by = "word"` silences anti_join's guessing message.
words_tokenized <- reddit_stress_data %>%
  select(c("id", "text", "label", "subreddit")) %>%
  unnest_tokens(word, text) %>%
  mutate(word = gsub('[[:punct:]]+','', word)) %>%
  mutate(word = gsub('\\<[[:digit:]]+\\>', '%d%', word)) %>%
  anti_join(stop_words, by = "word")
head(words_tokenized)
## # A tibble: 6 x 4
## id label subreddit word
## <dbl> <dbl> <chr> <chr>
## 1 33181 1 ptsd suggeted
## 2 33181 1 ptsd rest
## 3 33181 1 ptsd trigger
## 4 33181 1 ptsd ahead
## 5 33181 1 ptsd youire
## 6 33181 1 ptsd hypocondriac
# Posts per stress label. count(label) is the idiomatic shorthand for
# group_by(label) %>% count() and returns an ungrouped tibble, avoiding
# a lingering grouping attribute.
label_counts <- reddit_stress_data %>%
  count(label)
plot_ly(label_counts, x = ~label, y = ~n, type = "bar")
label_counts
## # A tibble: 2 x 2
## # Groups: label [2]
## label n
## <dbl> <int>
## 1 0 1696
## 2 1 1857
# Posts per subreddit; count(subreddit) replaces the verbose
# group_by() %>% count() pattern and leaves the result ungrouped.
subreddit_counts <- reddit_stress_data %>%
  count(subreddit)
subreddit_counts
## # A tibble: 10 x 2
## # Groups: subreddit [10]
## subreddit n
## <chr> <int>
## 1 almosthomeless 99
## 2 anxiety 650
## 3 assistance 355
## 4 domesticviolence 388
## 5 food_pantry 43
## 6 homeless 220
## 7 ptsd 711
## 8 relationships 694
## 9 stress 78
## 10 survivorsofabuse 315
# Bar chart of post counts per subreddit.
subreddit_counts %>%
  plot_ly(x = ~subreddit, y = ~n, type = "bar")
# Stacked
# Horizontal stacked bars: posts per subreddit, split by stress label.
reddit_stress_data %>%
  ggplot(aes(y = subreddit)) +
  geom_bar(aes(fill = as.factor(label)), position = "stack")
Now let’s see the most common words among the data (overall).
# Return the `num` most frequent words in `df`.
# Expects a `word` column (one token per row); result has columns
# word and n, sorted by descending frequency.
GetTopNMostCommonWords <- function(df, num) {
  word_freqs <- count(df, word, sort = TRUE)
  head(word_freqs, num)
}
# NOTE(review): `num` is 15 but the variable below is named
# top_10_full_data and the downstream chart titles say "Top 10" —
# the name/value mismatch looks unintended; confirm the intended N.
num <- 15
top_10_full_data <- GetTopNMostCommonWords(words_tokenized, num)
Now I will plot the top 15 most common words in the dataset.
# Bar chart of the most common words; the title is built from `num` so
# it always matches the number of words actually plotted (the original
# hard-coded "Top 10" while num was 15).
ggplot(top_10_full_data, aes(x = reorder(word, desc(n)), y = n)) +
  geom_col(fill = "steelblue") +
  labs(
    title = paste("Top", num, "Words from the Full Dataset"),
    x = "Word",
    y = "Frequency"
  )
Now let’s see how this varies among label: stressed or non-stressed.
# In the dreaddit dataset, label == 1 marks a stressed post and
# label == 0 a non-stressed one. The original assignments were swapped,
# so every downstream "stressed"/"non-stressed" plot was mislabeled.
stressed_data <- filter(words_tokenized, label == 1)
non_stressed_data <- filter(words_tokenized, label == 0)
Now let’s plot them
# Most common non-stressed words; title derived from `num` so it matches
# the number of words plotted (the original hard-coded "Top 10").
ggplot(GetTopNMostCommonWords(non_stressed_data, num),
       aes(x = reorder(word, desc(n)), y = n)) +
  geom_col(fill = "steelblue") +
  labs(
    title = paste("Top", num, "Words from the Non-Stressed Dataset"),
    x = "Word",
    y = "Frequency"
  )
Now let’s see the difference among stressed data.
# Most common stressed words; title derived from `num` so it matches
# the number of words plotted (the original hard-coded "Top 10").
ggplot(GetTopNMostCommonWords(stressed_data, num),
       aes(x = reorder(word, desc(n)), y = n)) +
  geom_col(fill = "steelblue") +
  labs(
    title = paste("Top", num, "Words from the Stressed Dataset"),
    x = "Word",
    y = "Frequency"
  )
# Visualizing the Distribution of Words By Label
# Word frequencies per subreddit, most frequent first. Reuses
# words_tokenized (built with the identical cleaning pipeline above)
# instead of re-tokenizing the whole dataset from scratch.
words_tokenized_by_subreddit_counts <- words_tokenized %>%
  group_by(subreddit) %>%
  count(word) %>%
  arrange(subreddit, desc(n))
head(words_tokenized_by_subreddit_counts)
## # A tibble: 6 x 3
## # Groups: subreddit [1]
## subreddit word n
## <chr> <chr> <int>
## 1 almosthomeless im 76
## 2 almosthomeless %d% 69
## 3 almosthomeless job 32
## 4 almosthomeless dont 31
## 5 almosthomeless time 28
## 6 almosthomeless ive 25
# Word frequencies per stress label, most frequent first. Reuses
# words_tokenized instead of repeating the tokenization pipeline.
# NOTE(review): the name says "by_subreddit" but this version is grouped
# by label; kept as-is because later code reads it under this name.
words_tokenized_by_subreddit_counts <- words_tokenized %>%
  group_by(label) %>%
  count(word) %>%
  arrange(label, desc(n))
head(words_tokenized_by_subreddit_counts)
## # A tibble: 6 x 3
## # Groups: label [1]
## label word n
## <dbl> <chr> <int>
## 1 0 %d% 1200
## 2 0 im 659
## 3 0 time 450
## 4 0 dont 391
## 5 0 ive 331
## 6 0 people 316
# Top 10 words per label, faceted by label. slice_max() supersedes
# top_n() and, like it, keeps ties; the unused topic_r helper column
# from the original is dropped.
words_tokenized_by_subreddit_counts %>%
  slice_max(n, n = 10) %>%
  ungroup() %>%
  mutate(label = as.factor(label)) %>%
  arrange(label, n) %>%
  ggplot(aes(word, n, fill = label)) +
  geom_col() +
  facet_wrap(~ label, scales = "free") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
Now let’s plot them
# Rebuild the per-subreddit word counts (the variable was overwritten
# above with the per-label version). Reuses words_tokenized instead of
# running the full tokenization pipeline a third time.
words_tokenized_by_subreddit_counts <- words_tokenized %>%
  group_by(subreddit) %>%
  count(word) %>%
  arrange(subreddit, desc(n))
head(words_tokenized_by_subreddit_counts)
## # A tibble: 6 x 3
## # Groups: subreddit [1]
## subreddit word n
## <chr> <chr> <int>
## 1 almosthomeless im 76
## 2 almosthomeless %d% 69
## 3 almosthomeless job 32
## 4 almosthomeless dont 31
## 5 almosthomeless time 28
## 6 almosthomeless ive 25
# Top 10 words per subreddit, faceted by subreddit. slice_max()
# supersedes top_n() and keeps ties the same way; the unused topic_r
# helper column is dropped.
words_tokenized_by_subreddit_counts %>%
  slice_max(n, n = 10) %>%
  ungroup() %>%
  arrange(subreddit, n) %>%
  ggplot(aes(word, n, fill = subreddit)) +
  geom_col() +
  facet_wrap(~ subreddit, scales = "free") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
# Boxplot of sentiment. geom_boxplot() has no `bins` parameter (that
# argument belongs to geom_histogram()), which produced the
# "Ignoring unknown parameters" warning — removed here.
ggplot(reddit_stress_data, aes(x = sentiment)) +
  geom_boxplot(fill = "steelblue") +
  labs(title = "Distribution of Sentiment")
# Histogram of sentiment with a vertical reference line at mx = 0,
# annotated as the neutral point.
mx <- 0
ggplot(reddit_stress_data, aes(x = sentiment)) +
  geom_histogram(fill = "steelblue", bins = 50) +
  labs(title = "Distribution of Sentiment") +
  geom_vline(xintercept = mx, col = "red", lwd = 1) +
  annotate("text", x = 0.1, y = 400, label = "Neutral")
## By Label
# Sentiment distribution split by label: boxplots, then histograms with
# a red line marking neutral sentiment.
ggplot(reddit_stress_data, aes(x = sentiment)) +
  geom_boxplot(fill = "steelblue") +
  labs(title = "Distribution of Sentiment") +
  facet_wrap(~ label)
ggplot(reddit_stress_data, aes(x = sentiment)) +
  geom_histogram(fill = "steelblue", bins = 50) +
  labs(title = "Distribution of Sentiment") +
  facet_wrap(~ label) +
  geom_vline(xintercept = mx, col = "red", lwd = 1) +
  annotate("text", x = 0.2, y = 400, label = "Neutral")
## By Subreddit
# Sentiment distribution split by subreddit, then by subreddit x label
# with left-side, horizontally-readable strip labels.
ggplot(reddit_stress_data, aes(x = sentiment)) +
  geom_boxplot(fill = "steelblue") +
  labs(title = "Distribution of Sentiment") +
  facet_wrap(~ subreddit)
ggplot(reddit_stress_data, aes(x = sentiment)) +
  geom_boxplot(fill = "steelblue") +
  labs(title = "Distribution of Sentiment") +
  facet_grid(subreddit ~ label, switch = "y") +
  theme(strip.text.y.left = element_text(angle = 0))